library(pROC)
library(ggpubr)
library(Biostrings)
## Warning: package 'BiocGenerics' was built under R version 4.0.5
library(data.table)
library(dplyr)
library(purrr)
library(tidyverse)
library(yardstick)
library(doParallel)
library(foreach)
library(stringdist)
library(caret)
# Function to calculate the F score, given beta, precision and recall.
calculate_f_beta = function(beta, precision, recall) {
return((beta^2 + 1)*(precision*recall / ((beta^2)*precision + recall)))
}
# Function to provide a closest match. Used to match HLA Alleles across mixed output styles.
ClosestMatch2 = function(string, stringVector){
stringVector[amatch(string, stringVector, maxDist=Inf)]
}
FullDataset=readRDS("CoV2_testing_dataset_filtered.rds")
TEST_DATA_LOCATION="SARS_COV_2_DATASET/IEDB_OTB/"
# For each allele in the data
foreach(allele_i=1:length(unique(FullDataset$HLA_Allele)))%do%{
# Clean the allele text
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
# Write respective data to file
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_test_data.txt")
write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Define the output file for the predictions
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_Results.txt")
# Run the model for allele x.
system(paste0("python IEDB_Immunogenicity_Model_Calis/immunogenicity_model/predict_immunogenicity.py ",testdata,
" --allele=",HLA_ALLELE_FOR_TESTING," > ",RESULTS_OUTPUT))
}
TEST_DATA_LOCATION="SARS_COV_2_DATASET/IEDB_OTB/"
files <- dir(TEST_DATA_LOCATION, pattern = "*_Results")
# Read in all the Results files.
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(TEST_DATA_LOCATION, .)))
)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
IEDB_RESULTS <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Files are output from IEDB per allele and saved with the allele in the file name. Allele is not output in the data, so the below extracts allele information into a column from the file name
IEDB_RESULTS$file=gsub(x=IEDB_RESULTS$file,pattern="_Results.txt",replacement="")
IEDB_RESULTS=IEDB_RESULTS %>% mutate(HLA_Allele = gsub(x=IEDB_RESULTS$file,pattern="Allele_",replacement = ""))
# Map the allele in model output to allele nomenclature in our test dataset
IEDB_RESULTS$HLA_Allele = ClosestMatch2(IEDB_RESULTS$HLA_Allele,unique(FullDataset$HLA_Allele))
# Clean the data table and join it with the original full dataset
IEDB_RESULTS=IEDB_RESULTS %>% dplyr::rename(Peptide=peptide, ImmunogenicityScore=score) %>% select(!file)
IEDB_RESULTS=IEDB_RESULTS %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele")) %>% select(!length) %>% mutate(Dataset = "IEDB")
print(c("IEDB Results has same number of rows as test dataset? : ",IEDB_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "IEDB Results has same number of rows as test dataset? : "
## [2] "TRUE"
TEST_DATA_LOCATION="SARS_COV_2_DATASET/NETTEPI_OTB/"
# For each allele
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
# Clean the allele text: can't create a file with * or : in the name.
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":|\\*",replacement = "")
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_test_data.txt")
# Filter the data for the run
data_for_run = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
# What lengths are used on this run?
lengths = data_for_run %>% mutate(Length=Biostrings::width(Peptide))%>% pull(Length)%>% unique
# Write out data for this run
write.table(data_for_run %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_Results.xls")
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")
system(paste0("/Applications/netTepi-1.0_orig/netTepi -a ",HLA_ALLELE_FOR_TESTING," -p ",testdata," -xlsfile ",RESULTS_OUTPUT," -l ",paste0(lengths,collapse = ",")))
# Change .xls extension to .csv
system(paste0("mv ",RESULTS_OUTPUT," " ,gsub(x=RESULTS_OUTPUT,pattern=".xls",replacement=""),".csv"))
}
# Read output files in and combine them
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETTEPI_OTB"
files <- dir(TEST_DATA_LOCATION, pattern = "_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(TEST_DATA_LOCATION, .)))
)
# Get rid of columns we dont need.
NetTepi_Results <- unnest(data3) %>% as.data.table %>% dplyr::select(!c(Pos,Identity,Aff,Stab,Tcell,`%Rank`,file))
# Combined score becomes the 'immunogenicity score'.
NetTepi_Results=NetTepi_Results %>% dplyr::rename(ImmunogenicityScore=Comb,HLA_Allele=Allele) #Use the 'combined' score for analysis
#Allele formatting as input and output is different, so we match the alleles between NetTepi output and our test dataset by similarity.
NetTepi_Results$HLA_Allele = ClosestMatch2(NetTepi_Results$HLA_Allele,unique(FullDataset$HLA_Allele))
# Below confirms that the above matching works as otherwise the correct number of rows would not be joined by peptide-HLA
NetTepi_Results=NetTepi_Results %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele"))
print(c("NetTepi Results has same number of rows as test dataset? : ",NetTepi_Results %>% nrow == FullDataset %>% nrow))
## [1] "NetTepi Results has same number of rows as test dataset? : "
## [2] "TRUE"
NetTepi_Results=NetTepi_Results%>% select(!Epitopes) %>% mutate(Dataset = 'NETTEPI')
# Write data to file and run below script to train model OTB and process it
FullDataset %>% select(Peptide) %>% write.table(file="COV2_peptides_for_OTB_analysis.txt",quote=F,row.names = F)
source("run_ipred_OTB_CoV2.R")
IPRED_RESULTS=fread("SARS_COV_2_DATASET/IPRED_OTB/IPRED_RESULTS.txt")%>%dplyr::rename(Peptide=antigen.epitope,ImmunogenicityScore=imm.prob)
IPRED_RESULTS=IPRED_RESULTS %>% inner_join(FullDataset)
## Joining, by = "Peptide"
print(c("IPRED Results has same number of rows as test dataset? : ",IPRED_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "IPRED Results has same number of rows as test dataset? : "
## [2] "TRUE"
IPRED_RESULTS=IPRED_RESULTS %>% mutate(Dataset = "IPRED")
##Repitope - Output data in a format readable by Repitope
FullDataset %>% dplyr::rename(MHC=HLA_Allele) %>% mutate(Dataset = "CoV2_Eps") %>% write.csv(file = "CoV2_TestData_ForRepitope.csv",quote=F,row.names = F)
# Compute the features for our test dataset and write them to FST file
#source("compute_repitope_features.R")
#source("run_repitope_OTB.R")
REPITOPE_RESULTS = fread("SARS_COV_2_DATASET/REPITOPE_OTB/ALL_PREDICTIONS.csv")
print(c("REPITOPE Results has same number of rows as test dataset? : ",REPITOPE_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "REPITOPE Results has same number of rows as test dataset? : "
## [2] "FALSE"
REPITOPE_RESULTS=REPITOPE_RESULTS %>% full_join(FullDataset) %>% select(!"ImmunogenicityScore.cv")
REPITOPE_RESULTS= REPITOPE_RESULTS %>% mutate(Dataset = "REPITOPE")
print(c("REPITOPE Results has same number of rows as test dataset? : ",REPITOPE_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "REPITOPE Results has same number of rows as test dataset? : "
## [2] "TRUE"
# For each allele
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETMHCPAN_4_L/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
# Clean HLA and find lengths of test peptides
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")
LENGTHS=FullDataset %>% mutate(Length = nchar(Peptide)) %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
# Write test peptides to file for reading into netMHCpan
write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCpan-4.0/netMHCpan -p ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}
# Read in and process
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETMHCPAN_4_L/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract allele from file name
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
# Map HLA allele nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
# Join with test dataset
Netmhcpanres=Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))%>% inner_join( FullDataset)
## Joining, by = c("Peptide", "HLA_Allele")
Netmhcpanres %>% nrow
## [1] 878
# Munge the data table
Netmhcpanres=Netmhcpanres %>% select(Peptide, HLA_Allele,Immunogenicity,Ave,ImmunogenicityCont) %>% dplyr::rename(ImmunogenicityScore = Ave)%>% mutate(Dataset = "netMHCpan_EL")
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETMHCPAN_4_BA/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")
LENGTHS=FullDataset %>% mutate(Length = nchar(Peptide)) %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETMHCPAN_4_BA/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
NetmhcpanresBA <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
NetmhcpanresBA=NetmhcpanresBA %>% mutate(HLA_Allele = gsub(x=NetmhcpanresBA$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
NetmhcpanresBA$HLA_Allele = ClosestMatch2(NetmhcpanresBA$HLA_Allele,unique(FullDataset$HLA_Allele))
NetmhcpanresBA=NetmhcpanresBA %>% inner_join( FullDataset)
## Joining, by = c("Peptide", "HLA_Allele")
NetmhcpanresBA=NetmhcpanresBA %>% select(Peptide, HLA_Allele,Immunogenicity,Ave, ImmunogenicityCont) %>% dplyr::rename(ImmunogenicityScore = Ave)%>% mutate(Dataset = "netMHCpan_BA")
TEST_DATA_LOCATION="SARS_COV_2_DATASET/PRIMEOTB/"
# Model apparently can't deal with spaces in full path so save data to ~Documents/PRIMEDATA/x
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":|\\*||-|HLA",replacement = "")
testdata=paste0("~/Documents/PRIMEDATA/",HLA_ALLELE_FOR_TESTING,".txt")
data_for_run = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
write.table(data_for_run %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0("~/Documents/PRIMEDATA/",HLA_ALLELE_FOR_TESTING,"Results.txt")
system(paste0("/Applications/PRIME-master/PRIME -i ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -mix /Applications/MixMHCpred-2.1/MixMHCpred"," -o ", RESULTS_OUTPUT))
}
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/PRIME_OTB"
files <- dir(TEST_DATA_LOCATION, pattern = "Results.txt")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(TEST_DATA_LOCATION, .)))
)
PRIME_RESULTS <- unnest(data3) %>% as.data.table %>% select(Peptide,BestAllele,Score_bestAllele)
PRIME_RESULTS=PRIME_RESULTS %>% dplyr::rename(ImmunogenicityScore=Score_bestAllele,HLA_Allele=BestAllele)
PRIME_RESULTS$HLA_Allele = ClosestMatch2(PRIME_RESULTS$HLA_Allele, unique(FullDataset$HLA_Allele))
PRIME_RESULTS=PRIME_RESULTS %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele"))
print(c("PRIME_RESULTS Results has same number of rows as test dataset? : ",PRIME_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "PRIME_RESULTS Results has same number of rows as test dataset? : "
## [2] "TRUE"
PRIME_RESULTS=PRIME_RESULTS %>% mutate(Dataset = "PRIME")
# Output the data for use on the webserver.
FullDataset %>% mutate(Length = width(Peptide)) %>% filter(Length %in% c(9,10)) %>% select(Peptide, HLA_Allele) %>% mutate(HLA_Allele = gsub("\\:","",HLA_Allele)) %>% readr::write_csv(file="OUT_DEEPIMMUNO.csv",col_names = FALSE)
# Read in the webserver results
DEEPIMM=fread("SARS_COV_2_DATASET/DEEPIMMUNO_OTB/result.txt") %>% dplyr::rename(Peptide =peptide, HLA_Allele=HLA,ImmunogenicityScore=immunogenicity)
# Match the HLAs
DEEPIMM$HLA_Allele = ClosestMatch2(DEEPIMM$HLA_Allele,unique(FullDataset$HLA_Allele))
DEEPIMM %>% nrow
## [1] 878
DEEPIMM=DEEPIMM %>% inner_join(FullDataset)%>%mutate(Dataset = "DeepImmuno")
## Joining, by = c("Peptide", "HLA_Allele")
TEST_DATA_LOCATION= "SARS_COV_2_DATASET/GAO_OTB"
files <- dir(TEST_DATA_LOCATION, pattern = ".csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(TEST_DATA_LOCATION, .)))
)
GAO_RESULTS <- unnest(data3) %>% as.data.table
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
FullDataset %>% nrow
## [1] 878
FullDataset %>% left_join(GAO_RESULTS %>% dplyr::rename(Peptide=peptide, HLA_Allele=HLA)) %>% nrow
## Joining, by = c("Peptide", "HLA_Allele")
## [1] 878
GAO_RESULTS=FullDataset %>% left_join(GAO_RESULTS %>% dplyr::rename(Peptide=peptide, HLA_Allele=HLA))
## Joining, by = c("Peptide", "HLA_Allele")
GAO_RESULTS=GAO_RESULTS %>% select(!file) %>% dplyr::rename(ImmunogenicityScore=amplitude) %>% select(!immunogenic)%>% mutate(Dataset = "GAO")
# Bind all the model results together
combinedData = rbind(IEDB_RESULTS,NetTepi_Results,IPRED_RESULTS,REPITOPE_RESULTS,PRIME_RESULTS, Netmhcpanres,NetmhcpanresBA,GAO_RESULTS,DEEPIMM)
ALLOWEDLENGTHS = c(9,10) # Does not filter anything in this setting.
combinedData = combinedData%>% mutate(Length = width(Peptide)) %>% filter(Length %in% ALLOWEDLENGTHS)%>% select(!Length)
combinedData %>% nrow
## [1] 7902
# Confirm all have 878 obs
combinedData %>% select(Dataset) %>% table
## .
## DeepImmuno GAO IEDB IPRED NETTEPI PRIME
## 878 878 878 878 878 878
## REPITOPE netMHCpan_BA netMHCpan_EL
## 878 878 878
AUCDF = combinedData %>% group_by(Dataset) %>% dplyr::summarise(ROC=as.numeric(roc(Immunogenicity ~ ImmunogenicityScore)$auc))
# use 'roc' function from pROC to generate roc curves for each model
NETTEPIAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'NETTEPI'))
IPREDAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'IPRED'))
IEDBMODELAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'IEDB'))
REPITOPE_AUC_CV=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'REPITOPE'))
PRIME_AUC_CV = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'PRIME'))
DEEP_IMM_AUC = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'DeepImmuno'))
NETMHCPAN_IMM_AUC = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'netMHCpan_EL'))
NETMHCPAN_IMM_BA_AUC = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'netMHCpan_BA'))
GAO_AUC = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'GAO'))
# Use GGROC to combine and visualise the ROC-AUC curves
roc_AUC=ggroc(list(IEDB_Model=IEDBMODELAUC,iPred=IPREDAUC,NetTepi=NETTEPIAUC,REpitope=REPITOPE_AUC_CV,PRIME=PRIME_AUC_CV,DeepImmuno=DEEP_IMM_AUC,netMHCpan_EL=NETMHCPAN_IMM_AUC,netMHCpan_BA=NETMHCPAN_IMM_BA_AUC,GAO=GAO_AUC),legacy.axes = TRUE,size=1.25) + theme_bw() +
annotate("size"=4,"text",x=.80,y=.19,label=paste0("IEDB_Model: ",round(auc(IEDBMODELAUC),digits=3),"\n","iPred: ",round(auc(IPREDAUC),digits=3),"\n","NetTepi: ",round(auc(NETTEPIAUC),digits=3),"\n","REpitope: ",round(auc(REPITOPE_AUC_CV),digits=3), "\n","PRIME: ",round(auc(PRIME_AUC_CV),digits = 3), "\n","DeepImmuno: ", round(auc(DEEP_IMM_AUC),digits = 3), "\n","netMHCpan_EL: ", round(auc(NETMHCPAN_IMM_AUC),digits = 3), "\n","netMHCpan_BA: ", round(auc(NETMHCPAN_IMM_BA_AUC),digits = 3), "\n","GAO: ", round(auc(GAO_AUC),digits = 3))) + font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=14)+ geom_abline(size=1,intercept = 0, slope = 1,color = "darkgrey", linetype = "dashed")+theme(panel.background = element_rect(colour = "black", size=0.5))+ coord_fixed(xlim = 0:1, ylim = 0:1)#+ggtitle("ROC Curves")
# Set factor levels
combinedData$Immunogenicity = factor(combinedData$Immunogenicity,levels = c("Positive","Negative"))
# Create 'DATA_FOR_PR'. This is for plotting. We modify the model name labels and colour labels to ensure consistency between ROC-AUC and PR-AUC plots
DATA_FOR_PR = combinedData
#Calculate the real praucs
PR_AUC_COMBINED=combinedData %>% group_by(Dataset) %>% pr_auc(Immunogenicity,ImmunogenicityScore)
PR_AUC_COMBINED$.estimate=round(PR_AUC_COMBINED$.estimate,digits=3)
# Change model labels to ensure consistency between PR-AUC and ROC-AUC
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'IEDB',]$Dataset = "IEDB_Model"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'IPRED',]$Dataset = "iPred"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'NETTEPI',]$Dataset = "NetTepi"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'REPITOPE',]$Dataset = "REpitope"
# Produce and plot PR-AUC curve
pr_AUC=DATA_FOR_PR %>% group_by(Dataset) %>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% pr_curve(Immunogenicity,ImmunogenicityScore) %>%
autoplot() + aes(size = Dataset)+scale_size_manual(values=c(1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25)) +annotate("size"=4,"text",x=.8,y=.19,label=paste0("IEDB_Model: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'IEDB')%>%pull(".estimate"),"\n","iPred: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'IPRED')%>%pull(".estimate"),"\n","NetTepi: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'NETTEPI')%>%pull(".estimate"),"\n","REpitope: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'REPITOPE')%>%pull(".estimate"),"\n","PRIME: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'PRIME')%>%pull(".estimate"),"\n","DeepImmuno: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'DeepImmuno')%>%pull(".estimate"),"\n","netMHCpan_EL: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'netMHCpan_EL')%>%pull(".estimate"),"\n","netMHCpan_BA: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'netMHCpan_BA')%>%pull(".estimate"),"\n","GAO: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'GAO')%>%pull(".estimate") )) + geom_hline(size=1,color="darkgrey",yintercept = nrow(FullDataset[FullDataset$Immunogenicity=='Positive',]) / nrow(FullDataset),linetype="dashed")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=14)+theme(panel.background = element_rect(colour = "black", size=0.5))+ coord_fixed(xlim = 0:1, ylim = 0:1)
#use cowplot to organise the plots
GRID1= cowplot::plot_grid(roc_AUC)
GRID2= cowplot::plot_grid(pr_AUC)
cowplot::plot_grid(GRID1,GRID2,align="hv")
set.seed(41)
DATA_FOR_ROC=DATA_FOR_PR
#setup parallel backend to use multiple processors
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
ROC_AUC_RAND.DIST=foreach(i = 1:1000, .combine = rbind,.packages = c("dplyr","magrittr","pROC","data.table")) %dopar% {
DATA_FOR_ROC %>% group_by(Dataset)%>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% mutate(Shuffled_Immunogenicity=sample(size=n(),Immunogenicity)) %>% dplyr::summarise(ROC=as.numeric(roc(Shuffled_Immunogenicity ~ ImmunogenicityScore)$auc)) %>% mutate(sampleNum=i)
}
stopCluster(cl)
ROC_AUC_COMBINED=DATA_FOR_ROC%>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% group_by(Dataset) %>% dplyr::summarise(ROC=as.numeric(roc(Immunogenicity ~ ImmunogenicityScore)$auc))
ROC_RAND_DIST_FIG=ROC_AUC_RAND.DIST%>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% ggdensity(x="ROC",y="..density..",fill="Dataset",add="mean",color = "Dataset",alpha=0.3) +theme_pubr(base_size = 18)+ facet_wrap(~Dataset) + geom_vline(data=ROC_AUC_COMBINED,aes(xintercept=ROC),color="black",linetype="dashed") + xlab("Area under the ROC curve") + theme(legend.position = "none")+rotate_x_text(angle=90)
ROC_RAND_DIST_FIG
ROC_AUC_RAND.DIST %>% group_by(Dataset) %>% dplyr::summarise(Random_mean=mean(ROC),sd=sd(ROC)) %>% inner_join(ROC_AUC_COMBINED) %>% dplyr::rename(Predicted=ROC) %>% mutate(zscore = round(((Predicted-Random_mean)/sd),2) ) %>% DT::datatable(caption="Mean, sd and zscore to show distance from mean of random distribution")
## Joining, by = "Dataset"
#setup parallel backend to use multiple processors
set.seed(41)
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
PR_AUC_RAND.DIST=foreach(i = 1:1000, .combine = rbind,.packages = c("dplyr","magrittr","yardstick","data.table")) %dopar% {
DATA_FOR_PR %>% group_by(Dataset) %>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% mutate(Shuffled_Immunogenicity=sample(size=n(),Immunogenicity)) %>% mutate(Shuffled_Immunogenicity=factor(Shuffled_Immunogenicity,levels = c("Positive","Negative"))) %>%
pr_auc(Shuffled_Immunogenicity,ImmunogenicityScore) %>% mutate(sampleNum=i)
}
stopCluster(cl)
PR_AUC_COMBINED=DATA_FOR_PR %>% mutate(Immunogenicity=factor(Immunogenicity,levels = c("Positive","Negative"))) %>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% group_by(Dataset) %>% pr_auc(Immunogenicity,ImmunogenicityScore)
PR_RAND_DIST_FIG=PR_AUC_RAND.DIST %>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% ggdensity(x=".estimate",y="..density..",fill="Dataset",add="mean",color = "Dataset",alpha=0.3)+theme_pubr(base_size = 18) + facet_wrap(~Dataset) + geom_vline(data=PR_AUC_COMBINED,aes(xintercept=.estimate),color="black",linetype="dashed") + xlab("Area under the precision-recall curve") + theme(legend.position = "none")+rotate_x_text(angle=90)
PR_RAND_DIST_FIG
cowplot::plot_grid(ROC_RAND_DIST_FIG, PR_RAND_DIST_FIG, nrow=1,align="hv",axis="bt")
PR_AUC_RAND.DIST %>% group_by(Dataset) %>% dplyr::summarise(Random_mean=mean(.estimate),sd=sd(.estimate)) %>% inner_join(PR_AUC_COMBINED %>% select(!c(.metric,.estimator))) %>% dplyr::rename(Predicted=.estimate) %>% mutate(zscore = round(((Predicted-Random_mean)/sd),2) ) %>% DT::datatable(caption="Mean, sd and zscore to show distance from mean of random distribution")
## Joining, by = "Dataset"
foreach(i = 1:length(unique(combinedData$Dataset)))%do% {
MODEL = unique(combinedData$Dataset)[i]
ROC_MODEL=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% MODEL))
threshold = coords(roc=ROC_MODEL, x="best", input="threshold", best.method="youden", transpose=F)$threshold
threshold_data = combinedData %>% filter(Dataset %in% MODEL)%>% mutate(ImmunogenicityPrediction = ifelse(ImmunogenicityScore > threshold, "Positive","Negative"))
CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(threshold_data %>% select(Immunogenicity) %>% pull(),
levels = c("Negative","Positive")),
factor(threshold_data %>% select(ImmunogenicityPrediction) %>% pull(),
levels=c("Negative","Positive")))
table=data.frame(CM$table)
plotTable <- table %>%
mutate(Performance = ifelse(table$Prediction == table$Reference, "Accurate", "Inaccurate")) %>%
group_by(Reference) %>%
mutate(prop = Freq/sum(Freq))
CMplot=ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1,size=8) +
scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
theme_bw() +
xlim(rev(levels(table$Reference))) + ggtitle(MODEL)+ font("xy.text",size=18,color="black")+ font("xlab",size=18,color="black")+ font("ylab",size=18,color="black") + theme(plot.title = element_text(size=18))+ font("legend.text",size=12)
#plot(CMplot)
}
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]